datadir <- '~/UROP'
source(file.path(datadir, 'R/unsupervised.R'))
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## Attaching SeuratObject
library(ggpubr)
## Loading required package: ggplot2

Introduction

We now demonstrate a semi-supervised approach using a cancer-containing spatial dataset that has initial cell type labels. The initial cancer-cell labels were determined using known marker genes, while the other labels were generated with RCTD.

Data preprocessing

First, we add the cancer cell labels to the raw RCTD object and augment the original SpatialRNA object with the full list of genes.

myRCTD <- readRDS(file.path(datadir, '/Data/myRCTD_201014_03.rds'))
results <- read.csv(file = file.path(datadir, '/Data/results_processed_slideseq_data_2020-12-12_Puck_201014_03_2020-12-12_Puck_201014_03_cell_types.csv'))

cancer_results <- results[results$HRS_location == TRUE,]
results <- myRCTD@results$results_df
results$first_type = factor(results$first_type, levels=append(levels(results$first_type), 'Cancer_cells'))
results$second_type = factor(results$second_type, levels=append(levels(results$second_type), 'Cancer_cells'))
results[cancer_results$barcodes, 'spot_class'] <- 'singlet'
results[cancer_results$barcodes,'first_type'] <- 'Cancer_cells'
# below modifies cancer cell weight
for (barcode in cancer_results$barcodes) {
  if (barcode %in% rownames(myRCTD@results$weights_doublet)) {
    myRCTD@results$weights_doublet[barcode,'first_type'] <- 1
  }
}

myRCTD@results$results_df <- results
myRCTD@cell_type_info$info[[2]] <- append(myRCTD@cell_type_info$info[[2]], 'Cancer_cells')
myRCTD@cell_type_info$renorm[[2]] <- append(myRCTD@cell_type_info$renorm[[2]], 'Cancer_cells')
myRCTD@cell_type_info$info[[3]] <- myRCTD@cell_type_info$info[[3]] + 1
myRCTD@cell_type_info$renorm[[3]] <- myRCTD@cell_type_info$renorm[[3]] + 1
class_df <- myRCTD@internal_vars$class_df
class_df['Cancer_cells', 'class'] = 'Cancer_cells'
myRCTD@internal_vars$class_df <- class_df
myRCTD@config$max_cores <- 4

counts <- read.csv(file = file.path(datadir, '/Data/MappedDGEForR.csv'))
rownames(counts) <- counts$GENE; counts$GENE <- NULL
nUMI <- colSums(counts)
coords <- myRCTD@originalSpatialRNA@coords
puck <- SpatialRNA(coords, counts, nUMI)
myRCTD@originalSpatialRNA <- puck
saveRDS(puck, file.path(datadir, '/Data/cancer_puck.rds'))
saveRDS(myRCTD, file.path(datadir, '/Objects/vignette_v1/cancer_RCTD_initial.rds'))

Running semisupervised learning

We now run the unsupervised learning algorithm with the updated gene list and the new labels, using CSIDE on the new labels to generate our initialization.

myRCTD <- readRDS(file.path(datadir, 'Objects/vignette_v1/cancer_RCTD_initial.rds'))
RCTD_list <- run.de.initialization(myRCTD, convergence_thresh = 0.995)
saveRDS(RCTD_list, file.path(datadir, '/Objects/vignette_v1/cancer_RCTD_final.rds'))

Plotting cell types

We can plot the cell types spatially before and after the learning algorithm to compare the assignments.

RCTD_init <- readRDS(file.path(datadir, 'Objects/vignette_v1/cancer_RCTD_initial.rds'))
RCTD_list <- readRDS(file.path(datadir, 'Objects/vignette_v1/cancer_RCTD_final.rds'))
RCTD_pred <- RCTD_list[[length(RCTD_list)]]
plot_all_cell_types(RCTD_init@results$results_df, RCTD_init@originalSpatialRNA@coords, RCTD_init@cell_type_info$renorm[[2]], '..')

plot_all_cell_types(RCTD_pred@results$results_df, RCTD_pred@originalSpatialRNA@coords, RCTD_pred@cell_type_info$renorm[[2]], '..')

Marker genes

We can also find marker genes for the new cancer cell type.

get_marker_data <- function(cell_type_names, cell_type_means, gene_list) {
  marker_means = cell_type_means[gene_list,]
  marker_norm = marker_means / rowSums(marker_means)
  marker_data = as.data.frame(cell_type_names[max.col(marker_means)])
  marker_data$max_epr <- apply(cell_type_means[gene_list,],1,max)
  colnames(marker_data) = c("cell_type",'max_epr')
  rownames(marker_data) = gene_list
  marker_data$log_fc <- 0
  epsilon <- 1e-9
  for(cell_type in unique(marker_data$cell_type)) {
    cur_genes <- gene_list[marker_data$cell_type == cell_type]
    other_mean = rowMeans(cell_type_means[cur_genes,cell_type_names != cell_type])
    marker_data$log_fc[marker_data$cell_type == cell_type] <- log(epsilon + cell_type_means[cur_genes,cell_type]) - log(epsilon + other_mean)
  }
  return(marker_data)
}
cur_cell_types <- c('CD4_T_cells', 'CD8_T_cells', 'Monocytes_macrophages', 'Cancer_cells')
puck <- RCTD_pred@spatialRNA
cell_type_info_restr = RCTD_pred@cell_type_info$info
cell_type_info_restr[[1]] = cell_type_info_restr[[1]][,cur_cell_types]
cell_type_info_restr[[2]] = cur_cell_types; cell_type_info_restr[[3]] = length(cur_cell_types)
de_genes <- get_de_genes(cell_type_info_restr, puck, fc_thresh = 1.5, expr_thresh = .0001, MIN_OBS = 3)
## get_de_genes: CD4_T_cells found DE genes: 0
## get_de_genes: CD8_T_cells found DE genes: 0
## get_de_genes: Monocytes_macrophages found DE genes: 1
## get_de_genes: Cancer_cells found DE genes: 9
## get_de_genes: total DE genes: 10
marker_data_de = get_marker_data(cell_type_info_restr[[2]], cell_type_info_restr[[1]], de_genes)
marker_data_de[marker_data_de$cell_type == 'Cancer_cells', ]
##           cell_type      max_epr   log_fc
## CLC    Cancer_cells 0.0004681097 2.391379
## CCL17  Cancer_cells 0.0062723508 1.785208
## TXN    Cancer_cells 0.0033601797 1.717347
## BATF3  Cancer_cells 0.0003970375 1.651421
## DHRS2  Cancer_cells 0.0002454216 1.777520
## IL26   Cancer_cells 0.0001407837 1.728564
## TUBA3C Cancer_cells 0.0001663410 1.646592
## CCL22  Cancer_cells 0.0009392022 1.964587
## TUBB2B Cancer_cells 0.0001732457 1.629272

We can also observe the expression of known marker genes for cancer cells across the predicted cell types.

cancer_genes <- readRDS(file.path(datadir, 'Data/HRS_signature.rds'))
cell_type_info <- RCTD_pred@cell_type_info$info[[1]]
normalized_info <- apply(cell_type_info[intersect(rownames(cell_type_info), cancer_genes), ], MARGIN = 1, FUN = function(x) x/max(x))
heatmap(normalized_info, Colv = NA, Rowv = NA, scale="none")

Immune cell subtypes

We can now explore immune cell subtypes by running full mode on only pixels of a single cell type. We will focus on the three main cell types: CD4 T cells, CD8 T cells, and monocytes/macrophages.

RCTD_list <- readRDS(file.path(datadir, 'Objects/vignette_v1/cancer_RCTD_final.rds'))
RCTD_pred <- RCTD_list[[length(RCTD_list)]]

CD4 T Cells

CD4_RCTD <- initialize.subtypes(RCTD_pred, 'CD4_T_cells', resolution = 1)
CD4_RCTD <- run.subtypes(CD4_RCTD, max_iter = 200, convergence_thresh = 1e-5)
saveRDS(CD4_RCTD, file.path(datadir, 'Objects/vignette_v1/CD4_RCTD.rds'))

CD8 T Cells

CD8_RCTD <- initialize.subtypes(RCTD_pred, 'CD8_T_cells', resolution = 1)
CD8_RCTD <- run.subtypes(CD8_RCTD, max_iter = 200, convergence_thresh = 1e-5)
saveRDS(CD8_RCTD, file.path(datadir, 'Objects/vignette_v1/CD8_RCTD.rds'))

We can spatially plot cell subtypes.

predicted_plot <- function(x) {
  plot_puck_continuous(RCTD_pred@originalSpatialRNA, colnames(RCTD_pred@originalSpatialRNA@counts), RCTD_pred@results$weights[, x], size = 0.1, my_pal = pals::brewer.blues(20)[2:20], title = sprintf('Cluster %s', x))
} 

CD4_RCTD <- readRDS(file.path(datadir, 'Objects/vignette_v1/CD4_RCTD.rds'))
RCTD_pred <- CD4_RCTD[[length(CD4_RCTD)]]
plots = list()

for (i in 0:5) {
  plots[[i+1]] <- predicted_plot(as.character(i))
}

ggarrange(plotlist = plots)

CD8_RCTD <- readRDS(file.path(datadir, 'Objects/vignette_v1/CD8_RCTD.rds'))
RCTD_pred <- CD8_RCTD[[length(CD8_RCTD)]]
plots = list()

for (i in 0:4) {
  plots[[i+1]] <- predicted_plot(as.character(i))
}

ggarrange(plotlist = plots)